{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyarrow.parquet as pq\n",
    "\n",
    "import transformers\n",
    "import numpy as np\n",
    "import json\n",
    "\n",
    "\n",
    "DATASET = \"../../dataset_processed_v3/sharegpt_v3.2.train.parquet\"\n",
    "TOKENIZER_NAME = \"imone/LLaMA2_13B_with_EOT_token\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load dataset\n",
    "dataset = pq.read_table(DATASET)\n",
    "\n",
    "# Load tokenizer\n",
    "tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "18 1911.0328626542228 4096\n",
      "[21436 27808 12549 74425 34001 22231 12524  5758 53903 36293 32039 34018\n",
      " 12534 61012 36128 67877 59158 19842 39765 25296 32036  3156 54101 69927\n",
      " 10372 41171 77743 28537 24479 39592 50125 12681  8676 76826 64141 71009\n",
      " 70235 59193  1666 22873 45708 45607 56562 22487 27296 22017 48507 28541\n",
      "   909 20975 20955 35385 55246 20921 48195  1823  7913 50270 77569 26721\n",
      " 37832 12089 55575 32153 63153 54966 21406 77440 19843 63211 73818 21762\n",
      " 45065 63244  4330 11884   256 39383 10885  2080 59535 60511 63385  7756\n",
      " 56739 23108 75592  5754 40388  8506 67092 13778 47203 66085 61831 33750\n",
      " 52926  9651 52932 25011]\n"
     ]
    }
   ],
   "source": [
    "# Length statistics\n",
    "lengths = np.array(dataset.column(\"total_length\"))\n",
    "\n",
    "print (np.min(lengths), np.mean(lengths), np.max(lengths))\n",
    "\n",
    "print (np.argsort(lengths)[:100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'total_length': [961], 'num_seqs': [1], '0_tokens': [[1, 402, 7982, 29941, 4911, 29901, 7251, 32000, 402, 7982, 29941, 4007, 22137, 29901, 15043, 29991, 1128, 508, 306, 6985, 366, 9826, 29973, 32000, 402, 7982, 29941, 4911, 29901, 437, 366, 1073, 10377, 5847, 32000, 402, 7982, 29941, 4007, 22137, 29901, 3869, 29892, 306, 437, 29991, 5641, 9998, 15028, 363, 4124, 20334, 300, 653, 3497, 2184, 29889, 739, 338, 263, 9608, 322, 3564, 8688, 304, 1653, 263, 23533, 29899, 517, 29899, 412, 261, 1158, 310, 15446, 322, 19383, 11266, 9799, 297, 263, 13235, 934, 1788, 29889, 512, 916, 3838, 29892, 5641, 9998, 6511, 366, 304, 3787, 322, 6232, 2066, 297, 263, 27189, 1705, 1891, 8214, 29892, 1728, 337, 5890, 373, 263, 6555, 1891, 1923, 29889, 910, 508, 367, 5407, 363, 8324, 988, 848, 4225, 304, 367, 3625, 1584, 565, 278, 2441, 1923, 5771, 1623, 470, 338, 12522, 1623, 29892, 408, 1532, 408, 363, 8324, 988, 4160, 864, 304, 6232, 2919, 2066, 1728, 5528, 1038, 292, 278, 21544, 310, 6555, 1891, 8635, 322, 3719, 2103, 29889, 32000, 402, 7982, 29941, 4911, 29901, 920, 437, 366, 3440, 372, 32000, 402, 7982, 29941, 4007, 22137, 29901, 1094, 385, 319, 29902, 4086, 1904, 29892, 306, 1016, 29915, 29873, 505, 7333, 26971, 470, 17750, 29879, 1048, 5722, 11763, 29892, 541, 306, 508, 3867, 366, 411, 777, 2498, 2472, 322, 7037, 23633, 310, 773, 5641, 9998, 29889, 13, 13, 6716, 310, 278, 1667, 23633, 310, 5641, 9998, 338, 393, 372, 6511, 363, 27189, 1705, 1891, 8635, 322, 19383, 310, 2793, 29892, 607, 508, 10032, 12536, 749, 373, 6555, 1891, 12424, 322, 19998, 7910, 620, 2638, 663, 322, 20847, 3097, 310, 848, 29889, 7280, 14169, 338, 393, 372, 508, 19998, 10032, 278, 3438, 322, 13644, 310, 2793, 28289, 491, 14372, 4160, 304, 9080, 2793, 304, 1269, 916, 297, 263, 23533, 29899, 517, 29899, 412, 261, 8214, 29889, 13, 13, 17245, 29892, 763, 738, 15483, 29892, 5641, 9998, 756, 967, 27028, 322, 11302, 29899, 22450, 29892, 322, 3692, 470, 451, 372, 338, 278, 1900, 7348, 363, 263, 3153, 671, 1206, 7111, 373, 263, 12875, 310, 13879, 29889, 1152, 1342, 29892, 5641, 9998, 338, 1603, 263, 13774, 716, 15483, 29892, 322, 372, 1122, 451, 367, 408, 286, 1535, 470, 1532, 29899, 23765, 408, 901, 7841, 27809, 29889, 19814, 29892, 727, 1122, 367, 11302, 29899, 22450, 297, 4958, 310, 4180, 470, 6993, 746, 773, 263, 27189, 1705, 1891, 1788, 763, 5641, 9998, 29889, 13, 13, 797, 15837, 29892, 5641, 9998, 338, 263, 15483, 411, 967, 1914, 731, 310, 23633, 322, 27028, 29892, 322, 3692, 470, 451, 372, 338, 263, 1781, 6216, 363, 263, 3153, 671, 1206, 7111, 373, 263, 12875, 310, 13879, 29889, 32000, 402, 7982, 29941, 4911, 29901, 306, 1016, 29915, 29873, 3755, 2274, 29892, 1058, 3787, 278, 2066, 363, 10377, 5847, 32000, 402, 7982, 29941, 4007, 22137, 29901, 5641, 9998, 338, 263, 23533, 29899, 517, 29899, 412, 261, 313, 29925, 29906, 29925, 29897, 3564, 29892, 607, 2794, 393, 278, 2066, 526, 6087, 373, 278, 23226, 310, 278, 3564, 27138, 6053, 29889, 1932, 366, 788, 263, 934, 304, 5641, 9998, 29892, 372, 4947, 9391, 701, 964, 7968, 521, 18801, 29892, 322, 1269, 19875, 338, 9859, 263, 5412, 15882, 2000, 263, 2793, 3211, 29892, 607, 338, 263, 24941, 12122, 6608, 310, 278, 2793, 310, 278, 19875, 29889, 4525, 521, 18801, 526, 769, 13235, 4822, 278, 3564, 322, 6087, 373, 278, 7573, 310, 916, 27138, 1058, 526, 2734, 5641, 9998, 29889, 13, 13, 10401, 366, 2009, 263, 934, 373, 5641, 9998, 29892, 596, 2943, 3430, 363, 278, 521, 18801, 310, 278, 934, 373, 278, 3564, 322, 337, 465, 1590, 793, 963, 964, 278, 2441, 934, 29889, 7311, 278, 2793, 3211, 310, 278, 934, 9242, 278, 1021, 17126, 310, 988, 372, 338, 6087, 373, 278, 3564, 29892, 366, 508, 10563, 278, 934, 515, 738, 2943, 393, 756, 372, 29892, 3265, 1135, 2534, 304, 19104, 373, 263, 2323, 6555, 1891, 1923, 29889, 13, 13, 6295, 297, 3273, 29892, 278, 2066, 373, 5641, 9998, 526, 6087, 322, 13235, 4822, 278, 3564, 310, 5641, 9998, 7573, 29892, 607, 526, 13674, 278, 23226, 310, 278, 2305, 1058, 526, 5221, 1218, 297, 278, 3564, 29889, 32000, 402, 7982, 29941, 4911, 29901, 947, 393, 2099, 10377, 5847, 674, 2125, 701, 1407, 1568, 8086, 2913, 373, 590, 1887, 6601, 29914, 32000, 402, 7982, 29941, 4007, 22137, 29901, 1932, 366, 671, 5641, 9998, 29892, 596, 2943, 674, 3787, 263, 3509, 310, 278, 2066, 393, 366, 2130, 470, 12534, 29889, 2648, 2322, 29892, 596, 2943, 674, 884, 3787, 263, 7090, 310, 10325, 20592, 2066, 29892, 607, 508, 2125, 701, 8086, 2913, 975, 931, 29889, 13, 13, 17245, 29892, 366, 508, 2761, 920, 1568, 8086, 2913, 5641, 9998, 3913, 373, 596, 1887, 6601, 491, 10365, 292, 278, 6055, 310, 596, 5641, 9998, 2943, 29889, 1152, 1342, 29892, 366, 508, 731, 263, 4046, 373, 278, 5253, 310, 8086, 2913, 393, 5641, 9998, 338, 6068, 304, 671, 29892, 322, 366, 508, 884, 10822, 5641, 9998, 304, 6336, 3349, 2066, 515, 596, 1887, 8635, 565, 896, 505, 451, 1063, 20592, 363, 263, 3058, 3785, 310, 931, 29889, 13, 13, 797, 6124, 29892, 366, 508, 6755, 304, 1065, 5641, 9998, 297, 376, 2696, 1220, 29908, 4464, 29892, 607, 2794, 393, 596, 2943, 674, 451, 3787, 738, 2066, 12430, 322, 674, 871, 10563, 963, 515, 916, 7573, 373, 278, 3564, 746, 366, 2009, 963, 29889, 910, 508, 367, 5407, 565, 366, 526, 15041, 1048, 8086, 2913, 8744, 29889, 13, 13, 3563, 497, 29892, 278, 5253, 310, 8086, 2913, 393, 5641, 9998, 3913, 373, 596, 1887, 6601, 674, 8839, 373, 920, 1568, 2793, 366, 2130, 322, 920, 366, 10822, 596, 2943, 29915, 29879, 6055, 29889, 2973, 278, 1492, 6055, 29892, 3138, 29892, 366, 508, 2761, 278, 5253, 310, 8086, 2913, 393, 5641, 9998, 3913, 322, 5557, 372, 515, 5622, 701, 2086, 1568, 2913, 373, 596, 1887, 6601, 29889, 32000]], '0_masks': [[False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]], '1_tokens': [[]], '1_masks': [[]], 'total_loss_weight': [1.0842387676239014]}\n",
      "<s> GPT3 User: hi<|end_of_turn|>\n",
      "\n",
      " GPT3 Assistant: GPT3 User: do you know ipfs<|end_of_turn|>\n",
      "\n",
      " GPT3 Assistant: GPT3 User: how do you comment it<|end_of_turn|>\n",
      "\n",
      " GPT3 Assistant: GPT3 User: I don't quite understand, who store the files for ipfs<|end_of_turn|>\n",
      "\n",
      " GPT3 Assistant: GPT3 User: does that mean ipfs will take up very much disk space on my local computer/<|end_of_turn|>\n",
      "\n",
      " GPT3 Assistant:\n",
      "=================\n",
      "Hello! How can I assist you today?<|end_of_turn|>\n",
      "\n",
      " Yes, I do! IPFS stands for InterPlanetary File System. It is a protocol and network designed to create a peer-to-peer method of storing and sharing hypermedia in a distributed file system. In other words, IPFS allows you to store and share files in a decentralized manner, without relying on a centralized server. This can be useful for applications where data needs to be available even if the original server goes down or is shut down, as well as for applications where users want to share large files without incurring the costs of centralized storage and bandwidth.<|end_of_turn|>\n",
      "\n",
      " As an AI language model, I don't have personal opinions or beliefs about technologies, but I can provide you with some general information and potential benefits of using IPFS.\n",
      "\n",
      "One of the main benefits of IPFS is that it allows for decentralized storage and sharing of content, which can reduce reliance on centralized servers and potentially increase resilience and availability of data. Another benefit is that it can potentially reduce the cost and complexity of content delivery by allowing users to serve content to each other in a peer-to-peer manner.\n",
      "\n",
      "However, like any technology, IPFS has its limitations and trade-offs, and whether or not it is the best choice for a particular use case depends on a variety of factors. For example, IPFS is still a relatively new technology, and it may not be as mature or well-supported as more established alternatives. Additionally, there may be trade-offs in terms of performance or security when using a decentralized system like IPFS.\n",
      "\n",
      "In summary, IPFS is a technology with its own set of benefits and limitations, and whether or not it is a good fit for a particular use case depends on a variety of factors.<|end_of_turn|>\n",
      "\n",
      " IPFS is a peer-to-peer (P2P) network, which means that the files are stored on the computers of the network participants themselves. When you add a file to IPFS, it gets broken up into smaller chunks, and each chunk is assigned a unique identifier called a content address, which is a cryptographic hash of the content of the chunk. These chunks are then distributed across the network and stored on the nodes of other participants who are running IPFS.\n",
      "\n",
      "When you request a file on IPFS, your node looks for the chunks of the file on the network and reassembles them into the original file. Because the content address of the file remains the same regardless of where it is stored on the network, you can retrieve the file from any node that has it, rather than having to rely on a single centralized server.\n",
      "\n",
      "So in short, the files on IPFS are stored and distributed across the network of IPFS nodes, which are essentially the computers of the people who are participating in the network.<|end_of_turn|>\n",
      "\n",
      " When you use IPFS, your node will store a copy of the files that you access or pin. By default, your node will also store a cache of recently accessed files, which can take up disk space over time.\n",
      "\n",
      "However, you can control how much disk space IPFS uses on your local computer by adjusting the settings of your IPFS node. For example, you can set a limit on the amount of disk space that IPFS is allowed to use, and you can also configure IPFS to automatically remove files from your local storage if they have not been accessed for a certain period of time.\n",
      "\n",
      "In addition, you can choose to run IPFS in \"offline\" mode, which means that your node will not store any files locally and will only retrieve them from other nodes on the network when you request them. This can be useful if you are concerned about disk space usage.\n",
      "\n",
      "Overall, the amount of disk space that IPFS uses on your local computer will depend on how much content you access and how you configure your node's settings. With the right settings, however, you can control the amount of disk space that IPFS uses and prevent it from taking up too much space on your local computer.<|end_of_turn|>\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the supervised and unsupervised text\n",
    "sample_index = 21204\n",
    "\n",
    "sample = dataset.take([sample_index]).to_pydict()\n",
    "\n",
    "token_ids = np.array(sample[\"0_tokens\"])\n",
    "masks     = np.array(sample[\"0_masks\"])\n",
    "\n",
    "print(sample)\n",
    "\n",
    "print(tokenizer.decode(token_ids[~masks].tolist(), spaces_between_special_tokens=False).replace(\"<|end_of_turn|>\", \"<|end_of_turn|>\\n\\n\"))\n",
    "print(\"=================\")\n",
    "print(tokenizer.decode(token_ids[masks].tolist(), spaces_between_special_tokens=False).replace(\"<|end_of_turn|>\", \"<|end_of_turn|>\\n\\n\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
